Analyze selection data using soluble Ephrin-B2 or -B3¶
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
#input configs
altair_config = None
nipah_config = None
#input files
entropy_file = None
func_scores_E2_file = None
binding_E2_file = None
func_scores_E3_file = None
binding_E3_file = None
#output files
filtered_E2_binding_data = None
filtered_E3_binding_data = None
filtered_E2_binding_low_effect = None
filtered_E3_binding_low_effect = None
#output images
entry_binding_combined_corr_plot = None
entry_binding_combined_corr_plot_agg = None
E2_E3_correlation = None
E2_E3_correlation_site = None
combined_E2_E3_site_corr = None
binding_by_site_plot = None
entry_binding_corr_heatmap = None
binding_corr_heatmap = None
binding_region_boxplot_plot = None
binding_region_bubble_plot = None
max_binding_in_stalk = None
max_binding_in_contact = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entropy_file = "results/entropy/entropy.csv"
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
filtered_E2_binding_low_effect = (
"results/filtered_data/E2_binding_low_effect_filter.csv"
)
filtered_E3_binding_low_effect = (
"results/filtered_data/E3_binding_low_effect_filter.csv"
)
entry_binding_combined_corr_plot = (
"results/images/entry_binding_combined_corr_plot.html"
)
entry_binding_combined_corr_plot_agg = (
"results/images/entry_binding_combined_corr_plot_agg.html"
)
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
combined_E2_E3_site_corr = "results/images/combined_E2_E3_site_corr.html"
binding_by_site_plot = "results/images/binding_by_site_plot.html"
entry_binding_corr_heatmap = "results/images/entry_binding_corr_heatmap.html"
binding_corr_heatmap = "results/images/binding_corr_heatmap.html"
binding_region_boxplot_plot = "results/images/binding_region_boxplot_plot.html"
binding_region_bubble_plot = "results/images/binding_region_bubble_plot.html"
max_binding_in_contact = "results/images/max_binding_in_contact.html"
max_binding_in_stalk = "results/images/max_binding_in_stalk.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
Setup in correct directory
In [5]:
if nipah_config is None:
##hard paths in case don't want to run with snakemake
print('loading hard paths')
altair_config = "data/custom_analyses_data/theme.py"
nipah_config = "nipah_config.yaml"
entropy_file = 'results/entropy/entropy.csv'
#input files
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
filtered_E2_binding_data="results/filtered_data/E2_binding_filtered.csv"
filtered_E3_binding_data="results/filtered_data/E3_binding_filtered.csv"
filtered_E2_binding_low_effect="results/filtered_data/E2_binding_low_effect_filter.csv"
filtered_E3_binding_low_effect="results/filtered_data/E3_binding_low_effect_filter.csv"
Run config files to setup altair theme and config variables¶
In [6]:
if altair_config:
with open(altair_config, 'r') as file:
exec(file.read())
with open(nipah_config) as f:
config = yaml.safe_load(f)
Make the E2/E3 dataframes, filter separately, then merge¶
In [7]:
#import binding and entry data
e2 = pd.read_csv(binding_E2_file)
e2_func = pd.read_csv(func_scores_E2_file)
e3 = pd.read_csv(binding_E3_file)
e3_func = pd.read_csv(func_scores_E3_file)
In [8]:
def merge_func_binding_dfs(func,binding,name):
df_int = pd.merge(
binding,
func,
on=['site','mutant','wildtype'],
suffixes=['_binding','_cell_entry'],
validate='one_to_one',
how='outer'
).round(3)
df = df_int.rename(columns={'Ephrin binding_mean':'binding_mean','Ephrin binding_std':'binding_std','Ephrin binding_median':'binding_median'})
# Only save relevant columns
df = df[['site','wildtype','mutant','binding_median','binding_std','times_seen_binding','effect','effect_std','times_seen_cell_entry','frac_models']]
def filter_binding_data(df):
df_filter = df[
(df['mutant'] != '*') &
(df['mutant'] != '-') &
(df['site'] != 603) &
# Filter cell entry parameters
(df['effect'] >= config['min_func_effect_for_binding']) &
(df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
(df['effect_std'] <= config['func_std_cutoff']) &
# Filter binding parameters
(df['times_seen_binding'] >= config['min_times_seen_binding']) &
(df['binding_std'] <= config['max_binding_std']) &
(df['frac_models'] >= config['frac_models'])
]
return df_filter
df_filter = filter_binding_data(df)
#For pulling out low effect mutants for heatmaps later. Find mutants below func effect cutoff, but still have ok times_seen and func_std.
def store_filtered_info(df):
df_low_filter = df[
(df['mutant'] != '*') &
(df['mutant'] != '-') &
(df['site'] != 603) &
(df['effect'] < config['min_func_effect_for_binding']) &
(df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
(df['effect_std'] <= config['func_std_cutoff'])
]
return df_low_filter
df_low_effect_filter = store_filtered_info(df)
if name == 'EFNB2':
print(name)
df_filter.to_csv(filtered_E2_binding_data,index=False)
df_low_effect_filter.to_csv(filtered_E2_binding_low_effect,index=False)
else:
df_filter.to_csv(filtered_E3_binding_data,index=False)
df_low_effect_filter.to_csv(filtered_E3_binding_low_effect,index=False)
return df_filter,df_low_effect_filter
#Call filtering function
df_E2_filter,df_E2_filter_missing = merge_func_binding_dfs(e2_func,e2,'EFNB2')
df_E3_filter,df_E3_filter_missing = merge_func_binding_dfs(e3_func,e3,'EFNB3')
#Now that they are filtered, merge EFNB2 and EFNB3
df_binding_effect_merge = pd.merge(
df_E2_filter,
df_E3_filter,
on=['site','wildtype','mutant'],
suffixes=['_E2','_E3'],
how='outer'
)
#display stats
display(df_binding_effect_merge.describe().round(3))
# Make a concat df of E2/E3 data for plotting later
df_E2_filter['selection'] = 'EFNB2'
df_E3_filter['selection'] = 'EFNB3'
df_binding_effect_concat = pd.concat([df_E2_filter,df_E3_filter])
EFNB2
| site | binding_median_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_median_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7489.000 | 6971.000 | 6971.000 | 6971.000 | 6971.000 | 6971.000 | 6971.000 | 6971.000 | 6819.000 | 6819.000 | 6819.000 | 6819.000 | 6819.000 | 6819.000 | 6819.0 |
| mean | 342.073 | -0.324 | 0.511 | 5.919 | -0.139 | 0.375 | 7.353 | 0.997 | -0.019 | 0.186 | 5.720 | -0.104 | 0.400 | 6.313 | 1.0 |
| std | 148.724 | 1.110 | 0.326 | 2.955 | 0.599 | 0.192 | 4.149 | 0.027 | 0.279 | 0.181 | 2.841 | 0.569 | 0.188 | 3.316 | 0.0 |
| min | 71.000 | -5.284 | 0.004 | 2.000 | -2.000 | 0.018 | 2.000 | 0.750 | -1.960 | 0.000 | 2.000 | -1.997 | 0.044 | 2.000 | 1.0 |
| 25% | 215.000 | -0.410 | 0.277 | 4.000 | -0.428 | 0.229 | 4.750 | 1.000 | -0.154 | 0.061 | 4.000 | -0.361 | 0.260 | 4.286 | 1.0 |
| 50% | 340.000 | -0.022 | 0.435 | 5.250 | 0.038 | 0.343 | 6.500 | 1.000 | -0.015 | 0.137 | 5.000 | 0.063 | 0.370 | 5.571 | 1.0 |
| 75% | 468.000 | 0.206 | 0.666 | 7.000 | 0.318 | 0.488 | 8.750 | 1.000 | 0.118 | 0.252 | 7.000 | 0.314 | 0.509 | 7.286 | 1.0 |
| max | 602.000 | 2.602 | 1.989 | 39.750 | 0.640 | 1.000 | 64.380 | 1.000 | 2.008 | 1.958 | 40.000 | 0.660 | 0.998 | 49.140 | 1.0 |
Make nice interactive plot for correlation between binding and entry for EFNB2 and EFNB3¶
In [9]:
def plot_corr_binding_entry_updated(df,flag):
variant_selector = alt.selection_point(
on="mouseover",
empty=False,
fields=["site","mutant"],
value=0
)
variant_selector_agg = alt.selection_point(
on="mouseover",
empty=False,
fields=["site"],
value=0
)
slider = alt.binding_range(min=2, max=10, step=1, name="times seen")
selector = alt.param(name="SelectorName", value=2, bind=slider)
empty_chart = []
#df = df[df['site'].isin(config['contact_sites'])]
for cell in list(df['selection'].unique()):
tmp_df = df[df['selection'] == cell]
if flag == True:
agg_df = tmp_df.groupby('site')[['binding_median','effect']].sum().reset_index()
chart = alt.Chart(agg_df).mark_point(stroke='black',filled=True).encode(
x=alt.X('effect', title=f'Summed {cell} Cell Entry', axis=alt.Axis(grid=True)),
y=alt.Y('binding_median', title=f'Summed {cell} Binding', axis=alt.Axis(grid=True)),
opacity=alt.condition(variant_selector_agg, alt.value(1), alt.value(0.2)),
size=alt.condition(variant_selector_agg,alt.value(100),alt.value(50)),
strokeWidth=alt.condition(variant_selector_agg,alt.value(1),alt.value(0)),
color=alt.condition(variant_selector_agg,alt.value('orange'),alt.value('black')),
tooltip=['site', 'binding_median','effect'],
).properties(
width=200,
height=200,
).add_params(variant_selector_agg)
empty_chart.append(chart)
else:
chart = alt.Chart(tmp_df).mark_point(stroke='black',filled=True).encode(
x=alt.X('effect', title=f'{cell} Cell Entry', axis=alt.Axis(grid=True)),
y=alt.Y('binding_median', title=f'{cell} Binding', axis=alt.Axis(grid=True)),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
size=alt.condition(variant_selector,alt.value(50),alt.value(20)),
strokeWidth=alt.condition(variant_selector,alt.value(1),alt.value(0)),
color=alt.condition(variant_selector,alt.value('orange'),alt.value('black')),
tooltip=['site', 'wildtype', 'mutant','binding_median','times_seen_binding','effect'],
).properties(
width=200,
height=200,
).add_params(variant_selector)
empty_chart.append(chart)
combined_chart = alt.hconcat(*empty_chart,title=alt.Title('Correlation between binding and entry'))
return combined_chart
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat,False)
entry_binding_corr_plot.display()
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_plot.save(entry_binding_combined_corr_plot)
entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(df_binding_effect_concat,True)
entry_binding_corr_plot_agg.display()
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_plot_agg.save(entry_binding_combined_corr_plot_agg)
In [10]:
def plot_entry_binding_corr_heatmap(df):
empty_chart = []
for cell in list(df['selection'].unique()):
tmp_df = df[df['selection'] == cell]
chart = alt.Chart(tmp_df,title=f'{cell}').mark_rect().encode(
x=alt.X('effect',title='Cell Entry',axis=alt.Axis(values=[-2,-1,0,1])).bin(maxbins=60),
y=alt.Y('binding_median',title='Binding',axis=alt.Axis(values=[-4,-2,0,2])).bin(maxbins=60),
color=alt.Color('count()',title='Count').scale(scheme='greenblue'),
#tooltip=['effect','binding_median']
)
empty_chart.append(chart)
combined_chart = alt.hconcat(*empty_chart,title=alt.Title('Correlation between binding and entry')).resolve_scale(y='shared',x='shared',color='shared')
return combined_chart
entry_binding_corr_heat = plot_entry_binding_corr_heatmap(df_binding_effect_concat)
entry_binding_corr_heat.display()
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_heat.save(entry_binding_corr_heatmap)
In [11]:
def overall_stats(df,effect,name):
#Find quantiles
quantile_2 = df['binding_median'].quantile(.02)
quantile_98 = df['binding_median'].quantile(.98)
print(f'The 2% quantile for {name} is: {quantile_2}')
print(f'The 98% quantile for {name} is: {quantile_98}')
#Now group sites and find intolerant sites
filtered_df = df.groupby('site').filter(lambda group: (group[effect] <-0.25).all())
unique = filtered_df['site'].unique()
# Convert unique to a Pandas Series
unique_series = pd.Series(unique)
#print(unique_series)
# Find the common elements
unique_contact_bool = unique_series.isin(config['contact_sites'])
#print(unique_contact_bool)
# Filter and get the common elements
common_elements = unique_series[unique_contact_bool]
# Print the common elements
print(f'Here are the contact sites that are conserved: {common_elements}')
print(f'There are {len(unique)} sites with all negative binding score mutants for {name}')
print(f'These are the sites for {name} with all negative binding score mutants: {list(unique)}')
#Now find sites with low and high binding (median)
median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median')
print(f'For {name}, these are the sites with lowest median binding scores: {median_df.head(5)}')
median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median',ascending=False)
print(f'For {name}, these are the sites with highest median binding scores: {median_df.head(5)}')
#Now calculate mutant number
total_mutants = df.shape[0]
upper_cutoff = df[df[effect] > 1].sort_values(by='binding_median',ascending=False)
median_upper = upper_cutoff['effect'].median()
print(f'The median entry score for top binders was: {median_upper}')
mutants_above_cutoff_tolerated = upper_cutoff[upper_cutoff['effect'] > 0]
mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[['site','effect','binding_median','wildtype','mutant']]
print(f'The mutants with positive entry scores and good binding are: {mutants_above_cutoff_tolerated.head(5)}')
lower_cutoff = df[df[effect] < -1]
print(f'For {name}, there are a total of : {total_mutants} binding mutants')
print(f'For {name}, there are {upper_cutoff.shape[0]} mutants above cutoff, and {mutants_above_cutoff_tolerated.shape[0]} that have good entry scores')
print(f'For {name}, there are {lower_cutoff.shape[0]} mutants below cutoff')
total_sites = df['site'].unique().shape[0]
print(f'The total number of sites are: {total_sites}')
overall_stats(df_E2_filter,'binding_median','E2')
overall_stats(df_E3_filter,'binding_median','E3')
The 2% quantile for E2 is: -3.9 The 98% quantile for E2 is: 1.207199999999999 Here are the contact sites that are conserved: 4 240 5 242 10 389 21 490 22 491 27 501 28 504 29 505 31 531 32 532 33 533 34 557 35 579 36 581 39 588 dtype: int64 There are 41 sites with all negative binding score mutants for E2 These are the sites for E2 with all negative binding score mutants: [116, 220, 236, 238, 240, 242, 243, 248, 346, 351, 389, 390, 398, 400, 438, 439, 441, 460, 467, 486, 487, 490, 491, 494, 495, 497, 500, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590] For E2, these are the sites with lowest median binding scores: site binding_median 446 533 -4.0450 409 494 -4.0250 407 491 -3.9815 406 490 -3.8745 404 487 -3.8715 For E2, these are the sites with highest median binding scores: site binding_median 33 104 1.4065 48 120 1.3630 132 207 1.2080 129 204 1.1770 49 122 1.1750 The median entry score for top binders was: -0.925 The mutants with positive entry scores and good binding are: site effect binding_median wildtype mutant 1324 139 0.005 1.989 N Y 5448 354 0.228 1.498 S T 9533 566 0.081 1.415 F H 7170 444 0.163 1.256 I F 1217 134 0.099 1.249 S I For E2, there are a total of : 6971 binding mutants For E2, there are 229 mutants above cutoff, and 22 that have good entry scores For E2, there are 1040 mutants below cutoff The total number of sites are: 514 The 2% quantile for E3 is: -0.654 The 98% quantile for E3 is: 0.606 Here are the contact sites that are conserved: 3 389 7 501 8 505 9 531 dtype: int64 There are 10 sites with all negative binding score mutants for E3 These are the sites for E3 with all negative binding score mutants: [108, 243, 352, 389, 486, 494, 497, 501, 505, 531] For E3, these are the sites with lowest median binding scores: site binding_median 417 501 -0.9160 444 531 -0.7235 309 389 -0.6870 492 584 -0.6810 419 505 -0.6660 For E3, these are the sites with highest median binding scores: site binding_median 497 589 0.6805 56 129 0.5005 59 132 0.4675 155 231 0.4310 449 537 0.4275 The median entry score for top binders was: -0.764 The mutants with positive entry scores and good binding are: site effect binding_median wildtype mutant 7995 492 0.496 1.200 Q L 2632 211 0.037 1.149 G F 10015 598 0.344 1.141 P G 1655 161 0.234 1.011 S E For E3, there are a total of : 6819 binding mutants For E3, there are 27 mutants above cutoff, and 4 that have good entry scores For E3, there are 12 mutants below cutoff The total number of sites are: 511
Find sites with opposite effects on binding¶
In [12]:
#find sites that are different
def find_biggest_differences(df):
#df = df[df['site'].isin(config['contact_sites'])]
efnb2_good_efnb3_bad = df[
(df['binding_median_E2'] > 0.1) &
(df['binding_median_E3'] < -0.1)
].sort_values(by='binding_median_E2',ascending=False)
display(efnb2_good_efnb3_bad)
efnb2_bad_efnb3_good = df[
(df['binding_median_E2'] < -0.1) &
(df['binding_median_E3'] > 0.1)
].sort_values(by='binding_median_E3',ascending=False)
display(efnb2_bad_efnb3_good)
find_biggest_differences(df_binding_effect_merge)
| site | wildtype | mutant | binding_median_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_median_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 206 | 86 | A | L | 1.478 | 1.086 | 2.00 | -1.627 | 0.516 | 3.250 | 1.0 | -0.274 | 0.432 | 2.0 | -0.320 | 0.838 | 2.000 | 1.0 |
| 496 | 120 | I | M | 1.363 | 0.837 | 5.50 | -0.638 | 0.447 | 6.750 | 1.0 | -0.357 | 0.806 | 4.0 | -0.197 | 0.222 | 5.714 | 1.0 |
| 1729 | 223 | A | D | 1.349 | 1.474 | 5.75 | -1.337 | 0.410 | 10.000 | 1.0 | -0.212 | 0.708 | 4.5 | -1.718 | 0.397 | 7.429 | 1.0 |
| 948 | 170 | E | T | 1.312 | 0.886 | 4.25 | -0.314 | 0.668 | 4.250 | 1.0 | -0.136 | 0.235 | 4.0 | -0.697 | 0.573 | 5.000 | 1.0 |
| 1099 | 179 | S | P | 1.268 | 0.727 | 5.25 | -1.972 | 0.796 | 5.750 | 1.0 | -0.102 | 0.188 | 5.5 | -1.414 | 0.590 | 7.143 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4882 | 446 | D | F | 0.104 | 0.201 | 5.25 | 0.361 | 0.280 | 7.125 | 1.0 | -0.204 | 0.102 | 5.0 | 0.511 | 0.238 | 5.286 | 1.0 |
| 1728 | 223 | A | C | 0.103 | 0.426 | 4.50 | 0.226 | 0.252 | 7.250 | 1.0 | -0.145 | 0.129 | 5.0 | -0.063 | 0.337 | 5.429 | 1.0 |
| 4939 | 449 | G | P | 0.101 | 0.134 | 3.50 | -0.342 | 0.453 | 4.500 | 1.0 | -0.329 | 0.368 | 2.5 | 0.140 | 0.616 | 3.571 | 1.0 |
| 6428 | 562 | I | E | 0.101 | 0.279 | 2.75 | -0.222 | 0.478 | 5.500 | 1.0 | -0.339 | 0.506 | 3.0 | 0.307 | 0.441 | 4.000 | 1.0 |
| 6873 | 596 | K | G | 0.101 | 0.808 | 6.00 | 0.309 | 0.284 | 5.750 | 1.0 | -0.277 | 0.027 | 4.5 | 0.390 | 0.270 | 6.143 | 1.0 |
382 rows × 17 columns
| site | wildtype | mutant | binding_median_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_median_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5517 | 492 | Q | L | -0.167 | 0.523 | 5.00 | 0.514 | 0.344 | 4.500 | 1.0 | 1.200 | 0.031 | 5.0 | 0.496 | 0.296 | 4.857 | 1.0 |
| 6897 | 598 | P | G | -0.200 | 0.772 | 4.50 | -1.051 | 0.970 | 4.500 | 1.0 | 1.141 | 1.564 | 4.0 | 0.344 | 0.542 | 5.000 | 1.0 |
| 6787 | 588 | I | P | -2.239 | 0.805 | 4.75 | -0.187 | 0.393 | 4.500 | 1.0 | 1.052 | 0.055 | 5.0 | -0.637 | 0.369 | 5.000 | 1.0 |
| 144 | 81 | A | W | -0.744 | 0.376 | 3.00 | -1.744 | 0.344 | 5.375 | 1.0 | 0.921 | 0.131 | 3.0 | -0.974 | 0.723 | 4.143 | 1.0 |
| 504 | 123 | N | G | -0.162 | 0.646 | 3.25 | -1.133 | 0.779 | 4.500 | 1.0 | 0.820 | 0.160 | 2.5 | 0.141 | 0.616 | 4.429 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5677 | 507 | V | M | -1.357 | 0.772 | 6.00 | 0.419 | 0.080 | 7.375 | 1.0 | 0.102 | 0.614 | 4.0 | -0.441 | 0.506 | 5.143 | 1.0 |
| 2329 | 275 | N | V | -1.259 | 0.628 | 3.75 | -0.264 | 0.370 | 6.750 | 1.0 | 0.101 | 0.378 | 4.0 | -0.121 | 0.246 | 4.143 | 1.0 |
| 5811 | 518 | N | E | -0.151 | 0.285 | 5.75 | 0.295 | 0.370 | 7.500 | 1.0 | 0.101 | 0.200 | 6.5 | 0.225 | 0.386 | 5.286 | 1.0 |
| 3066 | 324 | K | S | -0.203 | 0.732 | 4.00 | -0.328 | 0.680 | 5.500 | 1.0 | 0.101 | 0.024 | 4.0 | 0.217 | 0.197 | 4.571 | 1.0 |
| 6477 | 567 | L | F | -0.313 | 0.242 | 12.25 | 0.335 | 0.264 | 15.500 | 1.0 | 0.101 | 0.152 | 12.5 | 0.102 | 0.302 | 15.570 | 1.0 |
219 rows × 17 columns
Find correlations between EFNB2 and EFNB3 binding¶
In [13]:
def plot_entry_binding_corr(df):
chart = alt.Chart(df,title='Correlation Between Ephrin Binding Scores').mark_rect().encode(
x=alt.X('binding_median_E2',title='EFNB2 binding',axis=alt.Axis(values=[-5,0,2])).bin(maxbins=40),
y=alt.Y('binding_median_E3',title='EFNB3 binding',axis=alt.Axis(values=[-2,0,2])).bin(maxbins=40),
color=alt.Color('count()',title='Count').scale(scheme='greenblue'),
).properties(
height=200,
width=200,
)
return chart
entry_binding_corr_heatmap_1 = plot_entry_binding_corr(df_binding_effect_merge)
entry_binding_corr_heatmap_1.display()
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_heatmap_1.save(binding_corr_heatmap)
In [14]:
def plot_affinity_solid(df):
#slider = alt.binding_range(min=1, max=20, step=1, name="times_seen")
#selector = alt.param(name="SelectorName", value=1, bind=slider)
df = df.dropna()
# calculate r value
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df['binding_median_E2'], df['binding_median_E3'])
r_value = float(r_value)
# make chart
chart = alt.Chart(df,title=alt.Title('Correlation between Mutant Binding Scores',subtitle=f'r={r_value:.2f}')).mark_point(color='black',size=30, opacity=0.2,filled=True).encode(
x=alt.X('binding_median_E2', title=('EFNB2 Binding')),
y=alt.Y('binding_median_E3', title=('EFNB3 Binding')),
tooltip=['site', 'wildtype','mutant','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
).properties(
width=200,
height=200
)
min = int(df['binding_median_E2'].min())
max = int(df['binding_median_E3'].max())
text = alt.Chart({'values':[{'x': min, 'y': max, 'text': f'r = {r_value:.2f}'}]}).mark_text(
align='left', baseline='top', dx=-10, dy=-20).encode(
x=alt.X('x:Q'),
y=alt.Y('y:Q'),
text='text:N'
)
chart_and_text = chart
return chart_and_text
E2_E3_corr = plot_affinity_solid(df_binding_effect_merge)
E2_E3_corr.display()
if entry_binding_combined_corr_plot is not None:
E2_E3_corr.save(E2_E3_correlation)
Plot correlations between summary statistics for each site¶
In [15]:
def plot_affinity_solid_mean(df):
df = df.dropna()
means = df.groupby('site').agg({
'effect_E2': 'median',
'effect_E3': 'median',
'binding_median_E2': 'median',
'binding_median_E3': 'median',
'wildtype': 'first'
}).reset_index()
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(means['binding_median_E2'], means['binding_median_E3'])
r_value = float(r_value)
chart = alt.Chart(means,title=alt.Title('Correlation between Aggregate Mutant Binding Scores',subtitle=f'r={r_value:.2f}')).mark_point(size=50, color='black', opacity=0.3,filled=True).encode(
x=alt.X('binding_median_E2', title=('Median Ephrin-B2 Binding'), axis=alt.Axis(tickCount=3)),
y=alt.Y('binding_median_E3', title=('Median Ephrin-B3 Binding'), axis=alt.Axis(tickCount=3)),
tooltip=['site', 'wildtype','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
).properties(
width=200,
height=200
)
text = alt.Chart({'values':[{'x': -3.5, 'y': 0.5, 'text': f'r = {r_value:.2f}'}]}).mark_text(
align='left', baseline='top', dx=0, dy=-10).encode(
x=alt.X('x:Q'),
y=alt.Y('y:Q'),
text='text:N'
)
chart_and_text = chart #+ text
return chart_and_text#.display()
E2_E3_site_corr = plot_affinity_solid_mean(df_binding_effect_merge)
E2_E3_site_corr.display()
if entry_binding_combined_corr_plot is not None:
E2_E3_site_corr.save(E2_E3_correlation_site)
if entry_binding_combined_corr_plot is not None:
(E2_E3_site_corr | E2_E3_corr).save(combined_E2_E3_site_corr)
Make plot showing binding by site (median)¶
In [16]:
def plot_affinity_by_site_median(df):
variant_selector = alt.selection_point(
on="mouseover",
empty=False,
fields=["site"],
value=0
)
empty_charts = []
for selection in ['binding_median_E2','binding_median_E3']:
if selection == 'binding_median_E2':
name = 'EFNB2 Binding'
else:
name = 'EFNB3 Binding'
mean = df.groupby('site')[selection].sum().reset_index()
chart = alt.Chart(mean).mark_point(size=60, color='black', stroke='black',filled=True).encode(
x=alt.X('site', title=('Site'), axis=alt.Axis(grid=True, tickCount=4),scale=alt.Scale(domain=[70,602])),
y=alt.Y(selection, title=(name), axis=alt.Axis(grid=True, tickCount=4)),
tooltip=['site'],
color=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.4)),
strokeWidth=alt.condition(variant_selector,alt.value(2),alt.value(0))
).properties(
width=500,
height=100
).add_params(variant_selector)
empty_charts.append(chart)
combined_chart = alt.vconcat(*empty_charts, spacing=1,title='Summed Binding by Site')
return combined_chart
binding_by_site = plot_affinity_by_site_median(df_binding_effect_merge)
binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
binding_by_site.save(binding_by_site_plot)
In [17]:
def plot_affinity_by_contact_site(df,sites_to_show,title_text):
variant_selector = alt.selection_point(
on="mouseover",
nearest=True,
empty=False,
fields=["site"],
value=0
)
empty_charts = []
contact_df = df[df['site'].isin(sites_to_show)]
sites = list(contact_df['site'].unique())
for selection in df['selection'].unique():
tmp_df = contact_df[contact_df['selection'] == selection]
mean = tmp_df.groupby('site')['binding_median'].max().reset_index()
chart = alt.Chart(mean).mark_point(size=60, color='black', stroke='black',filled=True,opacity=1).encode(
x=alt.X('site:O', sort=sites,title=('Site'), axis=alt.Axis(grid=True, labelAngle=-90),scale=alt.Scale(domain=sites)),
y=alt.Y('binding_median', title=(f'{selection}'), axis=alt.Axis(grid=True)),
tooltip=['site'],
color=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),
#opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.4)),
strokeWidth=alt.condition(variant_selector,alt.value(2),alt.value(0))
).properties(
width=500,
height=100
).add_params(variant_selector)
empty_charts.append(chart)
combined_chart = alt.vconcat(*empty_charts, spacing=1,title=title_text)
return combined_chart
contact_binding_by_site = plot_affinity_by_contact_site(df_binding_effect_concat,config['contact_sites'],'Max Binding in Contact')
contact_binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
contact_binding_by_site.save(max_binding_in_contact)
contact_binding_by_site_stalk = plot_affinity_by_contact_site(df_binding_effect_concat,list(range(96, 147)),"Max Binding in Stalk")
contact_binding_by_site_stalk.display()
if entry_binding_combined_corr_plot is not None:
contact_binding_by_site_stalk.save(max_binding_in_stalk)
Make bubble plots for binding in different areas of receptor pocket¶
In [18]:
def make_boxplot_binding_region(df,title):# Create a box plot using Altair for aggregated means
barrel_ranges = {
'Hydrophobic': config['hydrophobic'],
'Salt Bridges': config['salt_bridges'],
'Hydrogen Bonds': config['h_bond_total'],
'Contact': config['contact_sites'],
'Overall': list(range(71,602)),
}
mean_df = df.groupby('site')[['binding_median']].median().reset_index()
custom_order = ['Hydrophobic','Salt Bridges','Hydrogen Bonds','Contact','Overall']
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df['site'].isin(sites)]
for _, row in subset.iterrows():
agg_means.append({'barrel': barrel, 'effect': row['binding_median'],'site':row['site']})
agg_means_df = pd.DataFrame(agg_means)
chart = alt.Chart(agg_means_df).mark_point(filled=True,size=70,opacity=0.4,color='black').encode(
x=alt.X('barrel:O', sort=custom_order,title=None,axis=alt.Axis(labelAngle=-90)),
y=alt.Y('effect',title=f'Median {title} Binding',axis=alt.Axis(grid=True,tickCount=4)),
xOffset='random:Q',
#color = alt.Color('barrel').legend(None),
tooltip=['barrel', 'effect','site'],
).transform_calculate(
random="sqrt(-1*log(random()))*cos(2*PI*random())"
).properties(
height=alt.Step(20),
width=alt.Step(25)
)
return chart.display()
make_boxplot_binding_region(df_E2_filter,'EFNB2')
make_boxplot_binding_region(df_E3_filter,'EFNB3')
make boxplot of binding scores by region¶
In [19]:
def make_boxplot_binding_region(df):
barrel_ranges = {
"Stalk": list(range(96, 147)),
"Neck": list(range(148, 165)),
"Linker": list(range(166, 177)),
"Head": list(range(178, 602)),
'Receptor Contact': config['contact_sites'],
"Total": list(range(71, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
for selection in df['selection'].unique():
tmp_df = df[df["selection"] == selection]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "binding_median": row["binding_median"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_boxplot(color="darkgray", extent="min-max", opacity=1)
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"binding_median",
title=f"Binding",
axis=alt.Axis(grid=True, tickCount=4),
),
tooltip=["region", "binding_median", "site"],
)
.properties(height=200, width=200)
)
empty_charts.append(chart)
combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
y="independent", x="shared", color="independent"
)
return combined_effect_chart
entry_region_boxplot = make_boxplot_binding_region(df_binding_effect_concat)
entry_region_boxplot.display()
if entry_binding_combined_corr_plot is not None:
entry_region_boxplot.save(binding_region_boxplot_plot)
In [20]:
def make_bubble_binding_region(df):
barrel_ranges = {
"Stalk": list(range(96, 147)),
"Neck": list(range(148, 165)),
"Linker": list(range(166, 177)),
"Head": list(range(178, 602)),
'Receptor Contact': config['contact_sites'],
"Total": list(range(71, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
for selection in df['selection'].unique():
tmp_df = df[df["selection"] == selection]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "binding_median": row["binding_median"], "site": row["site"],"mutant": row["mutant"]}
)
agg_means_df = pd.DataFrame(agg_means)
variant_selector = alt.selection_point(
on="mouseover", empty=False, fields=["site",'mutant'], value=1
)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_point(color="black", filled=True, opacity=0.3, stroke='black')
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"binding_median",
title=f"Binding",
axis=alt.Axis(grid=True, tickCount=4),
),
xOffset="random:Q",
tooltip=["region", "binding_median", "site","mutant"],
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.2)),
strokeWidth=alt.condition(variant_selector,alt.value(2),alt.value(0)),
size=alt.condition(variant_selector,alt.value(50),alt.value(25)),
).transform_calculate(
random="sqrt(-1*log(random()))*cos(2*PI*random())"
).properties(height=200, width=200)
).add_params(variant_selector)
empty_charts.append(chart)
combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
y="independent", x="shared", color="independent"
).add_params(variant_selector)
return combined_effect_chart
entry_region_bubble = make_bubble_binding_region(df_binding_effect_concat)
entry_region_bubble.display()
if entry_binding_combined_corr_plot is not None:
entry_region_bubble.save(binding_region_bubble_plot)
Lets make some interactive plots¶
In [21]:
def plot_corr_binding_entry_interactive(df):
interval=alt.selection_interval()
chart = alt.Chart(df).mark_point(filled=True).encode(
x=alt.X('effect', title='Cell Entry', axis=alt.Axis(grid=True)),
y=alt.Y('binding_median', title='Binding', axis=alt.Axis(grid=True)),
color=alt.condition(interval,alt.value('orange'),alt.value('gray')),
tooltip=['site', 'wildtype', 'mutant','binding_median','times_seen_binding','effect'],
).add_params(interval)
hist=alt.Chart(df).mark_bar().encode(
y='count()',
x='site',
tooltip=['site']
).transform_filter(
interval
)
combined = chart & hist
return combined
entry_binding_corr_plot = plot_corr_binding_entry_interactive(df_E2_filter)
entry_binding_corr_plot.display()
In [ ]: